In [1]:
# import sys; sys.path.append('.')
from setup import *
%matplotlib inline
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 4)
pd.set_option('display.max_columns', 200)
In [2]:
print('Loading 200k tweets (including metadata) could take a minute or so...')
df = pd.read_csv(os.path.join(DATA_PATH, 'all_tweets.csv'), index_col='id', low_memory=False,
quotechar='"', quoting=pd.io.common.csv.QUOTE_NONNUMERIC)
# in iPython Notebook print out df.columns to show that many of them contain dots
# rename the columns to be attribute-name friendly
df.columns = [label.replace('.', '_') for label in df.columns]
print('Done.')
Loading 200k tweets (including metadata) could take a minute or so...
Done.
In [3]:
# compress it using python 3 for future loading with python 3
df.to_csv(os.path.join(DATA_PATH, 'all_tweets.csv.gz'), quotechar='"', quoting=pd.io.common.csv.QUOTE_NONNUMERIC, compression='gzip')
# with gzip.open(os.path.join(DATA_PATH, 'all_tweets.csv.gz'), 'w') as fout:
In [9]:
print('The raw table shape is {}'.format(df.shape))
nonnull_rows = 330
nonnull_cols = 50
df = df.dropna(axis=1, thresh=nonnull_rows)
print('After dropping columns with fewer than {} nonnull values, the table shape is {}'.format(nonnull_rows, df.shape))
df = df.dropna(axis=0, thresh=nonnull_cols)
print('After dropping rows with fewer than {} nonnull values, the table shape is {}'.format(nonnull_cols, df.shape))
# in ipython notebook, explore and describe the DataFrame columns
print('Of the {} columns, {} are actually DataFrames'.format(len(df.columns), sum([not isinstance(df[col], pd.Series) for col in df.columns])))
# remove dataframes with only 2 columns and one is the _str of the other:
for col in df.columns:
if isinstance(df[col], pd.DataFrame):
print('Column {} is a {}-wide DataFrame'.format(col, len(df[col].columns)))
if df[col].columns[1] == df[col].columns[0] + '_str':
print('Column {} looks easy because it has sub-columns {}'.format(col, df[col].columns))
df[col] = df[col][df[col].columns[1]]
else:
try:
assert(float(df[col].iloc[:, 0].max()) == float(df[col].iloc[:, 1].max()))
df[col] = df[col].fillna(-1, inplace=False)
series = pd.Series([int(Decimal(x)) for x in df[col].iloc[:, 1].values]).astype('int64').copy()
del df[col]
df[col] = series
print('Finished converting column {} to type {}({})'.format(col, type(df[col]), df[col].dtype))
except:
print_exc()
print('Of the {} columns, {} are still DataFrames after trying to convert both columns to long integers'.format(
len(df.columns), sum([not isinstance(df[col], pd.Series) for col in df.columns])))
The raw table shape is (200168, 285)
After dropping columns with fewer than 330 nonnull values, the table shape is (200168, 285)
After dropping rows with fewer than 50 nonnull values, the table shape is (193378, 285)
Of the 285 columns, 8 are actually DataFrames
Column quoted_status_id is a 2-wide DataFrame
Finished converting column quoted_status_id to type <class 'pandas.core.series.Series'>(float64)
Column quoted_status_id_str is a 2-wide DataFrame
Finished converting column quoted_status_id_str to type <class 'pandas.core.series.Series'>(float64)
Column retweeted_status_quoted_status_id is a 2-wide DataFrame
Finished converting column retweeted_status_quoted_status_id to type <class 'pandas.core.series.Series'>(float64)
Column retweeted_status_quoted_status_id_str is a 2-wide DataFrame
Finished converting column retweeted_status_quoted_status_id_str to type <class 'pandas.core.series.Series'>(float64)
Of the 281 columns, 0 are still DataFrames after trying to convert both columns to long integers
In [10]:
print('df.describe() stats:')
desc = df.describe()
for col, stats in desc.T.iterrows():
print('')
print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col])))
print(stats)
df.describe() stats:
/home/hobs/.virtualenvs/AgileMachineLearning/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
RuntimeWarning)
favorite_count (int64)
count 193378.000000
mean 0.629679
std 6.251319
...
50% 0.000000
75% 0.000000
max 1165.000000
Name: favorite_count, dtype: float64
id_str (int64)
count 1.933780e+05
mean 7.274888e+17
std 3.778481e+15
...
50% 7.271970e+17
75% 7.301318e+17
max 7.345639e+17
Name: id_str, dtype: float64
in_reply_to_status_id (float64)
count 1.116500e+04
mean 7.270313e+17
std 1.485877e+16
...
50% NaN
75% NaN
max 7.345608e+17
Name: in_reply_to_status_id, dtype: float64
in_reply_to_status_id_str (float64)
count 1.116500e+04
mean 7.270313e+17
std 1.485877e+16
...
50% NaN
75% NaN
max 7.345608e+17
Name: in_reply_to_status_id_str, dtype: float64
in_reply_to_user_id (float64)
count 1.300700e+04
mean 2.245103e+16
std 1.249313e+17
...
50% NaN
75% NaN
max 7.338248e+17
Name: in_reply_to_user_id, dtype: float64
in_reply_to_user_id_str (float64)
count 1.300700e+04
mean 2.245103e+16
std 1.249313e+17
...
50% NaN
75% NaN
max 7.338248e+17
Name: in_reply_to_user_id_str, dtype: float64
lat (float64)
count 643.000000
mean 33.957111
std 16.306535
...
50% NaN
75% NaN
max 59.800737
Name: lat, dtype: float64
lon (float64)
count 643.000000
mean -57.314729
std 70.435494
...
50% NaN
75% NaN
max 151.735644
Name: lon, dtype: float64
quoted_status_favorite_count (float64)
count 1698.000000
mean 361.769140
std 3222.551118
...
50% NaN
75% NaN
max 109888.000000
Name: quoted_status_favorite_count, dtype: float64
quoted_status_retweet_count (float64)
count 1698.000000
mean 298.006478
std 2681.634140
...
50% NaN
75% NaN
max 84527.000000
Name: quoted_status_retweet_count, dtype: float64
quoted_status_user_favourites_count (float64)
count 1698.000000
mean 7707.139576
std 26101.078684
...
50% NaN
75% NaN
max 640291.000000
Name: quoted_status_user_favourites_count, dtype: float64
quoted_status_user_followers_count (float64)
count 1.698000e+03
mean 3.351078e+05
std 2.200700e+06
...
50% NaN
75% NaN
max 5.992102e+07
Name: quoted_status_user_followers_count, dtype: float64
quoted_status_user_friends_count (float64)
count 1698.000000
mean 3878.302709
std 14818.430933
...
50% NaN
75% NaN
max 181261.000000
Name: quoted_status_user_friends_count, dtype: float64
quoted_status_user_id (float64)
count 1.698000e+03
mean 2.315020e+16
std 1.265821e+17
...
50% NaN
75% NaN
max 7.322411e+17
Name: quoted_status_user_id, dtype: float64
quoted_status_user_id_str (float64)
count 1.698000e+03
mean 2.315020e+16
std 1.265821e+17
...
50% NaN
75% NaN
max 7.322411e+17
Name: quoted_status_user_id_str, dtype: float64
quoted_status_user_listed_count (float64)
count 1698.000000
mean 3456.337456
std 17113.085283
...
50% NaN
75% NaN
max 173929.000000
Name: quoted_status_user_listed_count, dtype: float64
quoted_status_user_statuses_count (float64)
count 1698.000000
mean 26693.128386
std 49781.314248
...
50% NaN
75% NaN
max 354746.000000
Name: quoted_status_user_statuses_count, dtype: float64
quoted_status_user_utc_offset (float64)
count 1406.000000
mean -8585.206259
std 16993.135149
...
50% NaN
75% NaN
max 43200.000000
Name: quoted_status_user_utc_offset, dtype: float64
retweet_count (int64)
count 193378.000000
mean 53.567846
std 877.497404
...
50% 0.000000
75% 4.000000
max 166648.000000
Name: retweet_count, dtype: float64
retweeted_status_favorite_count (float64)
count 69423.000000
mean 156.261441
std 1597.516635
...
50% NaN
75% NaN
max 215360.000000
Name: retweeted_status_favorite_count, dtype: float64
retweeted_status_id (float64)
count 6.942300e+04
mean 7.246058e+17
std 2.900791e+16
...
50% NaN
75% NaN
max 7.345625e+17
Name: retweeted_status_id, dtype: float64
retweeted_status_id_str (float64)
count 6.942300e+04
mean 7.246058e+17
std 2.900791e+16
...
50% NaN
75% NaN
max 7.345625e+17
Name: retweeted_status_id_str, dtype: float64
retweeted_status_in_reply_to_status_id (float64)
count 2.295000e+03
mean 7.242244e+17
std 2.985485e+16
...
50% NaN
75% NaN
max 7.345272e+17
Name: retweeted_status_in_reply_to_status_id, dtype: float64
retweeted_status_in_reply_to_status_id_str (float64)
count 2.295000e+03
mean 7.242244e+17
std 2.985485e+16
...
50% NaN
75% NaN
max 7.345272e+17
Name: retweeted_status_in_reply_to_status_id_str, dtype: float64
retweeted_status_in_reply_to_user_id (float64)
count 2.802000e+03
mean 1.101349e+16
std 8.824095e+16
...
50% NaN
75% NaN
max 7.273329e+17
Name: retweeted_status_in_reply_to_user_id, dtype: float64
retweeted_status_in_reply_to_user_id_str (float64)
count 2.802000e+03
mean 1.101349e+16
std 8.824095e+16
...
50% NaN
75% NaN
max 7.273329e+17
Name: retweeted_status_in_reply_to_user_id_str, dtype: float64
retweeted_status_quoted_status_favorite_count (float64)
count 2162.000000
mean 294.716004
std 6840.594249
...
50% NaN
75% NaN
max 311618.000000
Name: retweeted_status_quoted_status_favorite_count, dtype: float64
retweeted_status_quoted_status_retweet_count (float64)
count 2162.000000
mean 319.838575
std 8842.632116
...
50% NaN
75% NaN
max 406556.000000
Name: retweeted_status_quoted_status_retweet_count, dtype: float64
retweeted_status_quoted_status_user_favourites_count (float64)
count 1071.000000
mean 4255.130719
std 13101.264209
...
50% NaN
75% NaN
max 269482.000000
Name: retweeted_status_quoted_status_user_favourites_count, dtype: float64
retweeted_status_quoted_status_user_followers_count (float64)
count 1.071000e+03
mean 6.740821e+05
std 1.972163e+06
...
50% NaN
75% NaN
max 6.762776e+06
Name: retweeted_status_quoted_status_user_followers_count, dtype: float64
retweeted_status_quoted_status_user_friends_count (float64)
count 1071.000000
mean 3565.981326
std 13356.713457
...
50% NaN
75% NaN
max 96189.000000
Name: retweeted_status_quoted_status_user_friends_count, dtype: float64
retweeted_status_quoted_status_user_id (float64)
count 1.071000e+03
mean 4.383884e+16
std 1.711556e+17
...
50% NaN
75% NaN
max 7.287328e+17
Name: retweeted_status_quoted_status_user_id, dtype: float64
retweeted_status_quoted_status_user_id_str (float64)
count 1.071000e+03
mean 4.383884e+16
std 1.711556e+17
...
50% NaN
75% NaN
max 7.287328e+17
Name: retweeted_status_quoted_status_user_id_str, dtype: float64
retweeted_status_quoted_status_user_listed_count (float64)
count 1071.000000
mean 1733.955182
std 3274.306542
...
50% NaN
75% NaN
max 21608.000000
Name: retweeted_status_quoted_status_user_listed_count, dtype: float64
retweeted_status_quoted_status_user_statuses_count (float64)
count 1071.000000
mean 43782.852474
std 81090.258295
...
50% NaN
75% NaN
max 354746.000000
Name: retweeted_status_quoted_status_user_statuses_count, dtype: float64
retweeted_status_quoted_status_user_utc_offset (float64)
count 969.000000
mean -5814.241486
std 17857.526235
...
50% NaN
75% NaN
max 43200.000000
Name: retweeted_status_quoted_status_user_utc_offset, dtype: float64
retweeted_status_retweet_count (float64)
count 69423.000000
mean 148.140818
std 1459.734542
...
50% NaN
75% NaN
max 166648.000000
Name: retweeted_status_retweet_count, dtype: float64
retweeted_status_user_favourites_count (float64)
count 69423.000000
mean 7552.074874
std 22786.294974
...
50% NaN
75% NaN
max 424498.000000
Name: retweeted_status_user_favourites_count, dtype: float64
retweeted_status_user_followers_count (float64)
count 6.942300e+04
mean 1.420601e+05
std 8.697409e+05
...
50% NaN
75% NaN
max 3.850754e+07
Name: retweeted_status_user_followers_count, dtype: float64
retweeted_status_user_friends_count (float64)
count 6.942300e+04
mean 7.067446e+03
std 3.476161e+04
...
50% NaN
75% NaN
max 4.717316e+06
Name: retweeted_status_user_friends_count, dtype: float64
retweeted_status_user_id (float64)
count 6.942300e+04
mean 3.075018e+16
std 1.450819e+17
...
50% NaN
75% NaN
max 7.340921e+17
Name: retweeted_status_user_id, dtype: float64
retweeted_status_user_id_str (float64)
count 6.942300e+04
mean 3.075018e+16
std 1.450819e+17
...
50% NaN
75% NaN
max 7.340921e+17
Name: retweeted_status_user_id_str, dtype: float64
retweeted_status_user_listed_count (float64)
count 69423.000000
mean 1311.707316
std 5366.950887
...
50% NaN
75% NaN
max 173930.000000
Name: retweeted_status_user_listed_count, dtype: float64
retweeted_status_user_statuses_count (float64)
count 6.942300e+04
mean 2.441676e+04
std 5.399568e+04
...
50% NaN
75% NaN
max 1.277548e+06
Name: retweeted_status_user_statuses_count, dtype: float64
retweeted_status_user_utc_offset (float64)
count 53755.000000
mean -8987.895080
std 15669.797442
...
50% NaN
75% NaN
max 46800.000000
Name: retweeted_status_user_utc_offset, dtype: float64
user_favourites_count (int64)
count 193378.000000
mean 2981.352750
std 12739.631358
...
50% 73.000000
75% 1020.750000
max 673894.000000
Name: user_favourites_count, dtype: float64
user_followers_count (int64)
count 1.933780e+05
mean 3.443857e+03
std 5.943546e+04
...
50% 4.480000e+02
75% 1.141000e+03
max 1.038394e+07
Name: user_followers_count, dtype: float64
user_friends_count (int64)
count 193378.000000
mean 1428.301570
std 5848.678639
...
50% 343.000000
75% 1090.000000
max 382464.000000
Name: user_friends_count, dtype: float64
user_id (int64)
count 1.933780e+05
mean 5.724701e+16
std 1.942174e+17
...
50% 1.492945e+09
75% 3.333417e+09
max 7.342205e+17
Name: user_id, dtype: float64
user_id_str (int64)
count 1.933780e+05
mean 5.724701e+16
std 1.942174e+17
...
50% 1.492945e+09
75% 3.333417e+09
max 7.342205e+17
Name: user_id_str, dtype: float64
user_listed_count (int64)
count 193378.000000
mean 353.925746
std 1126.620779
...
50% 78.000000
75% 248.000000
max 129229.000000
Name: user_listed_count, dtype: float64
user_statuses_count (int64)
count 1.933780e+05
mean 6.134178e+04
std 1.382712e+05
...
50% 1.113600e+04
75% 5.799700e+04
max 2.537204e+06
Name: user_statuses_count, dtype: float64
user_utc_offset (float64)
count 119043.000000
mean -6377.122552
std 18027.953290
...
50% NaN
75% NaN
max 46800.000000
Name: user_utc_offset, dtype: float64
quoted_status_id (float64)
count 0.0
mean NaN
std NaN
...
50% NaN
75% NaN
max NaN
Name: quoted_status_id, dtype: float64
quoted_status_id_str (float64)
count 0.0
mean NaN
std NaN
...
50% NaN
75% NaN
max NaN
Name: quoted_status_id_str, dtype: float64
retweeted_status_quoted_status_id (float64)
count 0.0
mean NaN
std NaN
...
50% NaN
75% NaN
max NaN
Name: retweeted_status_quoted_status_id, dtype: float64
retweeted_status_quoted_status_id_str (float64)
count 0.0
mean NaN
std NaN
...
50% NaN
75% NaN
max NaN
Name: retweeted_status_quoted_status_id_str, dtype: float64
In [13]:
df
Out[13]:
coordinates_coordinates
coordinates_type
created_at
entities_hashtags
entities_media
entities_symbols
entities_urls
entities_user_mentions
favorite_count
favorited
geo_coordinates
geo_type
id_str
in_reply_to_screen_name
in_reply_to_status_id
in_reply_to_status_id_str
in_reply_to_user_id
in_reply_to_user_id_str
is_quote_status
lang
lat
lon
metadata_iso_language_code
metadata_result_type
place_bounding_box_coordinates
place_bounding_box_type
place_contained_within
place_country
place_country_code
place_full_name
place_id
place_name
place_place_type
place_url
possibly_sensitive
quoted_status_created_at
quoted_status_entities_hashtags
quoted_status_entities_media
quoted_status_entities_symbols
quoted_status_entities_urls
quoted_status_entities_user_mentions
quoted_status_favorite_count
quoted_status_favorited
quoted_status_is_quote_status
quoted_status_lang
quoted_status_metadata_iso_language_code
quoted_status_metadata_result_type
quoted_status_possibly_sensitive
quoted_status_retweet_count
quoted_status_retweeted
quoted_status_source
quoted_status_text
quoted_status_truncated
quoted_status_user_contributors_enabled
quoted_status_user_created_at
quoted_status_user_default_profile
quoted_status_user_default_profile_image
quoted_status_user_description
quoted_status_user_entities_description_urls
quoted_status_user_entities_url_urls
quoted_status_user_favourites_count
quoted_status_user_followers_count
quoted_status_user_friends_count
quoted_status_user_geo_enabled
quoted_status_user_has_extended_profile
quoted_status_user_id
quoted_status_user_id_str
quoted_status_user_is_translation_enabled
quoted_status_user_is_translator
quoted_status_user_lang
quoted_status_user_listed_count
quoted_status_user_location
quoted_status_user_name
quoted_status_user_profile_background_color
quoted_status_user_profile_background_image_url
quoted_status_user_profile_background_image_url_https
quoted_status_user_profile_background_tile
quoted_status_user_profile_banner_url
quoted_status_user_profile_image_url
quoted_status_user_profile_image_url_https
quoted_status_user_profile_link_color
quoted_status_user_profile_sidebar_border_color
quoted_status_user_profile_sidebar_fill_color
quoted_status_user_profile_text_color
quoted_status_user_profile_use_background_image
quoted_status_user_protected
quoted_status_user_screen_name
quoted_status_user_statuses_count
quoted_status_user_time_zone
quoted_status_user_url
quoted_status_user_utc_offset
quoted_status_user_verified
retweet_count
retweeted
retweeted_status_created_at
retweeted_status_entities_hashtags
retweeted_status_entities_media
retweeted_status_entities_symbols
retweeted_status_entities_urls
retweeted_status_entities_user_mentions
...
retweeted_status_quoted_status_user_profile_sidebar_fill_color
retweeted_status_quoted_status_user_profile_text_color
retweeted_status_quoted_status_user_profile_use_background_image
retweeted_status_quoted_status_user_protected
retweeted_status_quoted_status_user_screen_name
retweeted_status_quoted_status_user_statuses_count
retweeted_status_quoted_status_user_time_zone
retweeted_status_quoted_status_user_url
retweeted_status_quoted_status_user_utc_offset
retweeted_status_quoted_status_user_verified
retweeted_status_retweet_count
retweeted_status_retweeted
retweeted_status_source
retweeted_status_text
retweeted_status_truncated
retweeted_status_user_contributors_enabled
retweeted_status_user_created_at
retweeted_status_user_default_profile
retweeted_status_user_default_profile_image
retweeted_status_user_description
retweeted_status_user_entities_description_urls
retweeted_status_user_entities_url_urls
retweeted_status_user_favourites_count
retweeted_status_user_followers_count
retweeted_status_user_friends_count
retweeted_status_user_geo_enabled
retweeted_status_user_has_extended_profile
retweeted_status_user_id
retweeted_status_user_id_str
retweeted_status_user_is_translation_enabled
retweeted_status_user_is_translator
retweeted_status_user_lang
retweeted_status_user_listed_count
retweeted_status_user_location
retweeted_status_user_name
retweeted_status_user_profile_background_color
retweeted_status_user_profile_background_image_url
retweeted_status_user_profile_background_image_url_https
retweeted_status_user_profile_background_tile
retweeted_status_user_profile_banner_url
retweeted_status_user_profile_image_url
retweeted_status_user_profile_image_url_https
retweeted_status_user_profile_link_color
retweeted_status_user_profile_sidebar_border_color
retweeted_status_user_profile_sidebar_fill_color
retweeted_status_user_profile_text_color
retweeted_status_user_profile_use_background_image
retweeted_status_user_protected
retweeted_status_user_screen_name
retweeted_status_user_statuses_count
retweeted_status_user_time_zone
retweeted_status_user_url
retweeted_status_user_utc_offset
retweeted_status_user_verified
source
text
truncated
user_contributors_enabled
user_created_at
user_default_profile
user_default_profile_image
user_description
user_entities_description_urls
user_entities_url_urls
user_favourites_count
user_followers_count
user_friends_count
user_geo_enabled
user_has_extended_profile
user_id
user_id_str
user_is_translation_enabled
user_is_translator
user_lang
user_listed_count
user_location
user_name
user_profile_background_color
user_profile_background_image_url
user_profile_background_image_url_https
user_profile_background_tile
user_profile_banner_url
user_profile_image_url
user_profile_image_url_https
user_profile_link_color
user_profile_sidebar_border_color
user_profile_sidebar_fill_color
user_profile_text_color
user_profile_use_background_image
user_protected
user_screen_name
user_statuses_count
user_time_zone
user_url
user_utc_offset
user_verified
quoted_status_id
quoted_status_id_str
retweeted_status_quoted_status_id
retweeted_status_quoted_status_id_str
id
731122251278499841
NaN
NaN
Fri May 13 14:01:42 +0000 2016
[{u'indices': [47, 52], u'text': u'Java'}, {u'...
[{u'source_user_id': 150820027, u'source_statu...
[]
[{u'url': u'https://t.co/SVgMAwNxxj', u'indice...
[{u'indices': [3, 17], u'id_str': u'150820027'...
0
False
NaN
NaN
731122251278499841
NaN
NaN
NaN
NaN
NaN
False
en
NaN
NaN
en
recent
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
False
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
14
False
Fri May 13 13:30:47 +0000 2016
[{u'indices': [28, 33], u'text': u'Java'}, {u'...
[{u'expanded_url': u'http://twitter.com/javaco...
[]
[{u'url': u'https://t.co/SVgMAwNxxj', u'indice...
[]
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
14.0
False
<a href="http://bufferapp.com" rel="nofollow">...
Top Performance Metrics for #Java, .NET, #PHP,...
False
False
Tue Jun 01 22:38:53 +0000 2010
False
False
Java developers resource center. JCGs is one o...
[]
[{u'url': u'http://t.co/DivczES801', u'indices...
0.0
90268.0
130.0
False
False
150820027.0
150820027.0
False
False
en
1717.0
NaN
Java Code Geeks
ACDED6
http://abs.twimg.com/images/themes/theme18/bg.gif
https://abs.twimg.com/images/themes/theme18/bg...
False
https://pbs.twimg.com/profile_banners/15082002...
http://pbs.twimg.com/profile_images/2928906892...
https://pbs.twimg.com/profile_images/292890689...
038543
EEEEEE
F6F6F6
333333
True
False
javacodegeeks
37567.0
Athens
http://t.co/DivczES801
10800.0
False
<a href="http://twitter.com" rel="nofollow">Tw...
RT @javacodegeeks: Top Performance Metrics for...
False
False
Wed Aug 12 15:20:38 +0000 2009
False
False
Husband, Father, Programmer, Gamer, Graphic De...
[]
NaN
845
221
709
False
False
65061698
65061698
False
False
en
8
NaN
Greg Herhuth
000000
http://abs.twimg.com/images/themes/theme9/bg.gif
https://abs.twimg.com/images/themes/theme9/bg.gif
False
https://pbs.twimg.com/profile_banners/65061698...
http://pbs.twimg.com/profile_images/7228456300...
https://pbs.twimg.com/profile_images/722845630...
3B94D9
000000
000000
000000
False
False
zamajam
579
Eastern Time (US & Canada)
NaN
-14400.0
False
NaN
NaN
NaN
NaN
724281574129180672
NaN
NaN
Sun Apr 24 16:59:18 +0000 2016
[]
NaN
[]
[{u'url': u'https://t.co/HshSAeTMYc', u'indice...
[]
0
False
NaN
NaN
724281574129180672
NaN
NaN
NaN
NaN
NaN
False
en
NaN
NaN
en
recent
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
False
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0
False
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
<a href="http://twitterfeed.com" rel="nofollow...
World's Largest Python Discovered in Nepal: WA...
False
False
Tue Mar 24 14:13:53 +0000 2015
True
False
NaN
[]
[{u'url': u'http://t.co/mkBfH8QmsX', u'indices...
0
776
1910
True
False
3110463964
3110463964
False
False
en
4
Lokoja, Kogi State, Nigeria.
Ukpe Thompson
C0DEED
http://abs.twimg.com/images/themes/theme1/bg.png
https://abs.twimg.com/images/themes/theme1/bg.png
False
https://pbs.twimg.com/profile_banners/31104639...
http://pbs.twimg.com/profile_images/5852217706...
https://pbs.twimg.com/profile_images/585221770...
0084B4
C0DEED
DDEEF6
333333
True
False
newsymag
2159
Pacific Time (US & Canada)
http://t.co/mkBfH8QmsX
-25200.0
False
NaN
NaN
NaN
NaN
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
724275578879111169
NaN
NaN
Sun Apr 24 16:35:29 +0000 2016
[]
NaN
[]
[{u'indices': [26, 49], u'url': u'https://t.co...
[]
0
False
NaN
NaN
724275578879111169
NaN
NaN
NaN
NaN
NaN
False
en
NaN
NaN
en
recent
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
False
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0
False
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
<a href="https://path.com/" rel="nofollow">Pat...
Watching Boa vs. Python — https://t.co/5THbrirfQO
False
False
Wed Oct 05 01:11:53 +0000 2011
False
False
| Vocal @MEMORIES_MTL | Di Doan Ibu, ku Dengar...
[]
[{u'indices': [0, 22], u'url': u'http://t.co/j...
105
819
275
True
False
385181009
385181009
False
False
id
1
PLBNG - MGL
ﺳﻮﺭﻳﺎ
020305
http://pbs.twimg.com/profile_background_images...
https://pbs.twimg.com/profile_background_image...
True
https://pbs.twimg.com/profile_banners/38518100...
http://pbs.twimg.com/profile_images/7056528218...
https://pbs.twimg.com/profile_images/705652821...
2FC2EF
000000
252429
666666
True
False
bismillah____
59510
Bangkok
http://t.co/jgsHtjOt6x
25200.0
False
NaN
NaN
NaN
NaN
724275568871673857
NaN
NaN
Sun Apr 24 16:35:26 +0000 2016
[]
NaN
[]
[{u'indices': [115, 138], u'url': u'https://t....
[]
0
False
NaN
NaN
724275568871673857
NaN
NaN
NaN
NaN
NaN
False
ru
NaN
NaN
ru
recent
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
False
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0
False
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
<a href="http://vk.com" rel="nofollow">vk.com ...
Чертова дюжина вакансий в IT и Digital / / 1....
False
False
Sun May 22 03:29:30 +0000 2011
False
False
NaN
[]
NaN
1
61
15
False
False
302987528
302987528
False
False
ru
4
Rus
Alex Birgazov
C0DEED
http://pbs.twimg.com/profile_background_images...
https://pbs.twimg.com/profile_background_image...
False
https://pbs.twimg.com/profile_banners/30298752...
http://pbs.twimg.com/profile_images/1364034429...
https://pbs.twimg.com/profile_images/136403442...
0084B4
FFFFFF
DDEEF6
333333
True
False
weelman93
124
Irkutsk
NaN
28800.0
False
NaN
NaN
NaN
NaN
193378 rows × 281 columns
In [18]:
stats = df.describe()
columns = [c for c in stats.columns if stats[c]['count'] > 10000 or 'fav' in c or 'retweet' in c or df[c].dtype in (int, float, np.float64)] + ['text', 'favorite_count', 'geo_coordinates']
print(df.shape)
print(df[columns].shape)
for c in columns:
print(c)
df.text
/home/hobs/.virtualenvs/AgileMachineLearning/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
RuntimeWarning)
(193378, 281)
(193378, 60)
favorite_count
id_str
in_reply_to_status_id
in_reply_to_status_id_str
in_reply_to_user_id
in_reply_to_user_id_str
lat
lon
quoted_status_favorite_count
quoted_status_retweet_count
quoted_status_user_favourites_count
quoted_status_user_followers_count
quoted_status_user_friends_count
quoted_status_user_id
quoted_status_user_id_str
quoted_status_user_listed_count
quoted_status_user_statuses_count
quoted_status_user_utc_offset
retweet_count
retweeted_status_favorite_count
retweeted_status_id
retweeted_status_id_str
retweeted_status_in_reply_to_status_id
retweeted_status_in_reply_to_status_id_str
retweeted_status_in_reply_to_user_id
retweeted_status_in_reply_to_user_id_str
retweeted_status_quoted_status_favorite_count
retweeted_status_quoted_status_retweet_count
retweeted_status_quoted_status_user_favourites_count
retweeted_status_quoted_status_user_followers_count
retweeted_status_quoted_status_user_friends_count
retweeted_status_quoted_status_user_id
retweeted_status_quoted_status_user_id_str
retweeted_status_quoted_status_user_listed_count
retweeted_status_quoted_status_user_statuses_count
retweeted_status_quoted_status_user_utc_offset
retweeted_status_retweet_count
retweeted_status_user_favourites_count
retweeted_status_user_followers_count
retweeted_status_user_friends_count
retweeted_status_user_id
retweeted_status_user_id_str
retweeted_status_user_listed_count
retweeted_status_user_statuses_count
retweeted_status_user_utc_offset
user_favourites_count
user_followers_count
user_friends_count
user_id
user_id_str
user_listed_count
user_statuses_count
user_utc_offset
quoted_status_id
quoted_status_id_str
retweeted_status_quoted_status_id
retweeted_status_quoted_status_id_str
text
favorite_count
geo_coordinates
Out[18]:
id
731122251278499841 RT @javacodegeeks: Top Performance Metrics for...
724281574129180672 World's Largest Python Discovered in Nepal: WA...
...
724275578879111169 Watching Boa vs. Python — https://t.co/5THbrirfQO
724275568871673857 Чертова дюжина вакансий в IT и Digital / / 1....
Name: text, dtype: object
In [19]:
df[columns].to_csv(os.path.join(DATA_PATH, 'cleaned_tweets.csv.gz'), compression='gzip', encoding='UTF-8', quotechar='"', quoting=pd.io.common.csv.QUOTE_NONNUMERIC)
In [20]:
rawlen = len(df)
df.drop_duplicates('id_str', keep='last', inplace=True)
rawlen - len(df)
Out[20]:
10308
In [21]:
df[columns].to_csv(os.path.join(DATA_PATH, 'deduped_tweets.csv.gz'), compression='gzip', encoding='UTF-8', quotechar='"', quoting=pd.io.common.csv.QUOTE_NONNUMERIC)
In [ ]:
Content source: totalgood/twip
Similar notebooks: